# Setup: center all knitr figures.
knitr::opts_chunk$set(fig.align="center")
# NOTE(review): package attach order matters for function masking; keep this
# order. ggplot2 and magrittr are re-attached even though tidyverse already
# loads them (harmless no-ops).
library(rstanarm)   # Bayesian GLMs via Stan (stan_glm)
library(tidyverse)  # dplyr / tidyr / ggplot2 / readr, etc.
library(tidybayes)  # add_fitted_draws, stat_halfeye, mean_qi, compare_levels
library(modelr)
library(ggplot2)
library(magrittr)
library(emmeans)
library(bayesplot)
library(brms)
library(gganimate)
theme_set(theme_light())  # light theme for every ggplot in this document
In our experiment, we used a visualization recommendation algorithm (composed of one search algorithm and one oracle algorithm) to generate visualizations for the user on one of two datasets. We then measured the user’s time to complete each of four tasks: 1. Find Extremum 2. Retrieve Value 3. Prediction 4. Exploration
Given a search algorithm (bsf or dfs), an oracle (compassql or dziban), and a dataset (birdstrikes or movies), we would like to predict the time it takes the average user to complete each task. In addition, we would like to know if the choice of search algorithm and oracle has any meaningful impact on a user’s completion time for each of these four tasks.
# Load per-participant, per-task completion times and prepare the modeling
# columns. The three design variables become factors; completion_time is also
# carried on the log scale so the model cannot predict negative times.
# (Was: `=` assignment plus a separate re-assignment; single `<-` pipeline.)
time_data <- read.csv('processed_completion_time.csv') %>%
  mutate(
    # Convert all three categorical design variables in one pass.
    across(c(dataset, oracle, search), as.factor),
    log_completion_time = log(completion_time)
  )
Units on these are log seconds. We choose to work in log space in order to prevent our model from predicting completion times of less than zero seconds.
# Prior for the intercept, on the log-seconds scale: exp(5.69) ~ 296 s,
# sd 0.69 ~ a multiplicative factor of two. (Was `=`; use `<-` for top-level
# assignment per R style.)
prior_mean <- 5.69
prior_sd <- 0.69
# Accumulators filled in by the per-task sections below.
models <- list()
draw_data <- list()
search_differences <- list()
oracle_differences <- list()
# RNG seed shared by every stan_glm fit and posterior-draw call.
seed <- 12
task_list <- c("1. Find Extremum",
               "2. Retrieve Value",
               "3. Prediction",
               "4. Exploration")
# Task 1 (Find Extremum): Gaussian linear model on log completion time with
# all main effects and interactions of dataset, oracle, and search.
data_find_extremum <- subset(time_data, task == "1. Find Extremum")
models$find_extremum <- stan_glm(
log_completion_time ~ dataset * oracle * search,
data = data_find_extremum,
# Priors are on the log-seconds scale; autoscale = FALSE uses the stated
# sd as-is rather than rescaling by the data.
prior_intercept = normal(prior_mean, prior_sd, autoscale = FALSE),
prior = normal(0, prior_sd, autoscale = FALSE),
seed = seed
)
##
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 1).
## Chain 1:
## Chain 1: Gradient evaluation took 9.5e-05 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.95 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1:
## Chain 1:
## Chain 1: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 1: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 1: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 1: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 1: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 1: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 1: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 1: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 1: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 1: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 1: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 1: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 1:
## Chain 1: Elapsed Time: 0.125642 seconds (Warm-up)
## Chain 1: 0.113495 seconds (Sampling)
## Chain 1: 0.239137 seconds (Total)
## Chain 1:
##
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 2).
## Chain 2:
## Chain 2: Gradient evaluation took 1.1e-05 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.11 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2:
## Chain 2:
## Chain 2: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 2: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 2: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 2: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 2: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 2: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 2: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 2: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 2: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 2: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 2: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 2: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 2:
## Chain 2: Elapsed Time: 0.097877 seconds (Warm-up)
## Chain 2: 0.08048 seconds (Sampling)
## Chain 2: 0.178357 seconds (Total)
## Chain 2:
##
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 3).
## Chain 3:
## Chain 3: Gradient evaluation took 1.7e-05 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.17 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3:
## Chain 3:
## Chain 3: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 3: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 3: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 3: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 3: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 3: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 3: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 3: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 3: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 3: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 3: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 3: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 3:
## Chain 3: Elapsed Time: 0.087203 seconds (Warm-up)
## Chain 3: 0.078331 seconds (Sampling)
## Chain 3: 0.165534 seconds (Total)
## Chain 3:
##
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 4).
## Chain 4:
## Chain 4: Gradient evaluation took 9e-06 seconds
## Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.09 seconds.
## Chain 4: Adjust your expectations accordingly!
## Chain 4:
## Chain 4:
## Chain 4: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 4: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 4: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 4: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 4: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 4: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 4: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 4: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 4: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 4: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 4: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 4: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 4:
## Chain 4: Elapsed Time: 0.082043 seconds (Warm-up)
## Chain 4: 0.078863 seconds (Sampling)
## Chain 4: 0.160906 seconds (Total)
## Chain 4:
In the summary table, we want to see Rhat values close to 1.0 and effective sample sizes (n_eff) in the thousands.
# Posterior summary: check Rhat near 1.0 and n_eff for convergence.
summary(models$find_extremum)
##
## Model Info:
## function: stan_glm
## family: gaussian [identity]
## formula: log_completion_time ~ dataset * oracle * search
## algorithm: sampling
## sample: 4000 (posterior sample size)
## priors: see help('prior_summary')
## observations: 59
## predictors: 8
##
## Estimates:
## mean sd 10% 50% 90%
## (Intercept) 5.5 0.2 5.3 5.5 5.7
## datasetmovies -0.2 0.2 -0.5 -0.2 0.1
## oracledziban 0.0 0.2 -0.3 0.0 0.3
## searchdfs 0.1 0.2 -0.2 0.1 0.4
## datasetmovies:oracledziban 0.1 0.3 -0.3 0.1 0.5
## datasetmovies:searchdfs -0.2 0.3 -0.6 -0.2 0.2
## oracledziban:searchdfs -0.3 0.3 -0.7 -0.3 0.1
## datasetmovies:oracledziban:searchdfs 0.0 0.4 -0.5 0.0 0.5
## sigma 0.6 0.1 0.5 0.6 0.6
##
## Fit Diagnostics:
## mean sd 10% 50% 90%
## mean_PPD 5.3 0.1 5.2 5.3 5.5
##
## The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).
##
## MCMC diagnostics
## mcse Rhat n_eff
## (Intercept) 0.0 1.0 2520
## datasetmovies 0.0 1.0 2082
## oracledziban 0.0 1.0 2414
## searchdfs 0.0 1.0 1980
## datasetmovies:oracledziban 0.0 1.0 2048
## datasetmovies:searchdfs 0.0 1.0 1934
## oracledziban:searchdfs 0.0 1.0 2233
## datasetmovies:oracledziban:searchdfs 0.0 1.0 2194
## sigma 0.0 1.0 3559
## mean_PPD 0.0 1.0 4392
## log-posterior 0.1 1.0 1513
##
## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
Trace plots help us check whether there is evidence of non-convergence for the model.
# Default stanreg plot. NOTE(review): the default plotfun is posterior
# intervals, not traces — pass plotfun = "trace" if trace plots are intended.
plot(models$find_extremum)
In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).
# Pairwise posterior scatterplots to spot strongly correlated parameters.
pairs(models$find_extremum)
Using draws from the posterior, we can visualize parameter effects and average response. Be sure to apply an exponential transform to our log-transformed times to make them interpretable! The thicker, shorter line represents the 50% credible interval, while the thinner, longer line represents the 95% credible interval.
# Expected (fitted) posterior draws for each observed design cell.
# NOTE(review): add_fitted_draws is deprecated in newer tidybayes in favor of
# add_epred_draws (which names the column .epred instead of .value) — confirm
# the installed version before migrating, since downstream code uses .value.
draw_data$find_extremum <- data_find_extremum %>%
add_fitted_draws(models$find_extremum, seed = seed, re_formula = NA)
## Instead of posterior_linpred(..., transform=TRUE) please call posterior_epred(), which provides equivalent functionality.
# Posterior distribution of expected completion time per condition,
# back-transformed from log-seconds. Fix: alpha is a constant layer setting,
# not a data mapping — inside aes() it is run through the alpha scale and
# produces a spurious "0.5" legend entry, so set it on the layer instead.
plot_find_extremum <- ggplot(draw_data$find_extremum, aes(
x = exp(.value),
y = condition,
fill = dataset
)) + stat_halfeye(.width = c(.95, .5), alpha = 0.5) +
labs(x = "Time (Seconds)", y = "Condition")
plot_find_extremum
We can get the numeric values of the interval boundaries shown above with mean_qi
# Numeric 95% / 50% interval bounds for the expected times shown above.
fit_info_find_extremum <- draw_data$find_extremum %>% mean_qi(exp(.value), .width = c(.95, .5))
fit_info_find_extremum
## # A tibble: 118 x 16
## # Groups: participant_id, dataset, oracle, search, task, completion_time,
## # condition, task_type, log_completion_time [59]
## participant_id dataset oracle search task completion_time condition
## <int> <fct> <fct> <fct> <chr> <dbl> <chr>
## 1 1 birdst… compa… bfs 1. F… 420. compassq…
## 2 2 birdst… compa… bfs 1. F… 365. compassq…
## 3 3 birdst… compa… bfs 1. F… 300. compassq…
## 4 4 movies compa… dfs 1. F… 64.1 compassq…
## 5 5 movies dziban bfs 1. F… 229. dziban_b…
## 6 6 birdst… compa… dfs 1. F… 189. compassq…
## 7 7 movies compa… dfs 1. F… 97.4 compassq…
## 8 8 movies dziban dfs 1. F… 379. dziban_d…
## 9 9 movies dziban bfs 1. F… 399. dziban_b…
## 10 10 birdst… compa… bfs 1. F… 80.9 compassq…
## # … with 108 more rows, and 9 more variables: task_type <chr>,
## # log_completion_time <dbl>, .row <int>, `exp(.value)` <dbl>, .lower <dbl>,
## # .upper <dbl>, .width <dbl>, .point <chr>, .interval <chr>
## Saving 7 x 5 in image
Next, we want to see if there is any significant difference in completion time between the two search algorithms (bfs and dfs) and the two oracles (dziban and compassql).
# Fitted draws grouped by design cell, reused for both contrasts below.
find_extremum_predictive_data <- data_find_extremum %>%
add_fitted_draws(models$find_extremum, seed = seed, re_formula = NA) %>%
group_by(search, oracle, dataset, .draw)
## Instead of posterior_linpred(..., transform=TRUE) please call posterior_epred(), which provides equivalent functionality.
# Per-draw expected time (seconds) by search x dataset, then the dfs - bfs
# contrast per draw. Fix: weighted.mean() was called with no weights, which
# is exactly mean() — use mean() to state the intent.
search_differences$find_extremum <- find_extremum_predictive_data %>%
group_by(search, dataset, .draw) %>%
summarize(time = mean(exp(.value))) %>%
compare_levels(time, by = search) %>%
rename(diff_in_time = time)
## `summarise()` regrouping output by 'search', 'dataset' (override with `.groups` argument)
# Task label used as the y axis category when plotting this contrast.
search_differences$find_extremum$metric = "1. Find Extremum"
# Posterior difference in expected completion time between search algorithms;
# the dashed line at zero marks "no difference". Fix: alpha moved out of
# aes() — it is a fixed layer setting, not a mapped aesthetic.
search_differences$find_extremum %>%
ggplot(aes(x = diff_in_time, y = metric, fill = dataset)) +
xlab(paste0("Expected Difference in Completion Time (",search_differences$find_extremum[1,'search'],")")) +
ylab("Task")+
stat_halfeye(.width = c(.95, .5), alpha = 0.5) +
geom_vline(xintercept = 0, linetype = "longdash") +
theme_minimal()
We can double-check the boundaries of the credible intervals to be sure whether or not the interval contains zero.
# Interval bounds for the search contrast — does each CI contain zero?
search_differences$find_extremum %>% mean_qi(diff_in_time, .width = c(.95, .5))
## # A tibble: 4 x 8
## # Groups: search [1]
## search dataset diff_in_time .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 dfs - bfs birdstrikes 2.12 -94.2 96.1 0.95 mean qi
## 2 dfs - bfs movies -38.9 -116. 33.1 0.95 mean qi
## 3 dfs - bfs birdstrikes 2.12 -28.7 33.3 0.5 mean qi
## 4 dfs - bfs movies -38.9 -63.8 -13.5 0.5 mean qi
# Per-draw expected time by oracle x dataset, then the dziban - compassql
# contrast per draw. Fix: weighted.mean() with no weights is mean().
oracle_differences$find_extremum <- find_extremum_predictive_data %>%
group_by(oracle, dataset, .draw) %>%
summarize(time = mean(exp(.value))) %>%
compare_levels(time, by = oracle) %>%
rename(diff_in_time = time)
## `summarise()` regrouping output by 'oracle', 'dataset' (override with `.groups` argument)
# Task label used as the y axis category when plotting this contrast.
oracle_differences$find_extremum$metric = "1. Find Extremum"
# Posterior difference in expected completion time between oracles; dashed
# line at zero marks "no difference". Fix: alpha moved out of aes() — it is
# a fixed layer setting, not a mapped aesthetic.
oracle_differences$find_extremum %>%
ggplot(aes(x = diff_in_time, y = metric, fill = dataset)) +
xlab(paste0("Expected Difference in Completion Time (",oracle_differences$find_extremum[1,'oracle'],")")) +
ylab("Task")+
stat_halfeye(.width = c(.95, .5), alpha = 0.5) +
geom_vline(xintercept = 0, linetype = "longdash") +
theme_minimal()
We can double-check the boundaries of the credible intervals to be sure whether or not the interval contains zero.
# Interval bounds for the oracle contrast — does each CI contain zero?
oracle_differences$find_extremum %>% mean_qi(diff_in_time, .width = c(.95, .5))
## # A tibble: 4 x 8
## # Groups: oracle [1]
## oracle dataset diff_in_time .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 dziban - compas… birdstrik… -40.8 -134. 52.5 0.95 mean qi
## 2 dziban - compas… movies -1.27 -75.1 71.1 0.95 mean qi
## 3 dziban - compas… birdstrik… -40.8 -70.9 -9.79 0.5 mean qi
## 4 dziban - compas… movies -1.27 -25.7 23.1 0.5 mean qi
# Task 2 (Retrieve Value): same model specification and priors as Task 1.
data_retrieve_value <- subset(time_data, task == "2. Retrieve Value")
models$retrieve_value <- stan_glm(
log_completion_time ~ dataset * oracle * search,
data = data_retrieve_value,
# Log-seconds-scale priors; autoscale = FALSE keeps the stated sd.
prior_intercept = normal(prior_mean, prior_sd, autoscale = FALSE),
prior = normal(0, prior_sd, autoscale = FALSE),
seed = seed
)
##
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 1).
## Chain 1:
## Chain 1: Gradient evaluation took 2e-05 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.2 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1:
## Chain 1:
## Chain 1: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 1: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 1: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 1: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 1: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 1: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 1: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 1: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 1: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 1: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 1: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 1: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 1:
## Chain 1: Elapsed Time: 0.116888 seconds (Warm-up)
## Chain 1: 0.091973 seconds (Sampling)
## Chain 1: 0.208861 seconds (Total)
## Chain 1:
##
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 2).
## Chain 2:
## Chain 2: Gradient evaluation took 1e-05 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.1 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2:
## Chain 2:
## Chain 2: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 2: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 2: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 2: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 2: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 2: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 2: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 2: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 2: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 2: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 2: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 2: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 2:
## Chain 2: Elapsed Time: 0.094167 seconds (Warm-up)
## Chain 2: 0.088024 seconds (Sampling)
## Chain 2: 0.182191 seconds (Total)
## Chain 2:
##
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 3).
## Chain 3:
## Chain 3: Gradient evaluation took 1.6e-05 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.16 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3:
## Chain 3:
## Chain 3: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 3: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 3: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 3: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 3: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 3: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 3: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 3: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 3: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 3: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 3: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 3: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 3:
## Chain 3: Elapsed Time: 0.097329 seconds (Warm-up)
## Chain 3: 0.08179 seconds (Sampling)
## Chain 3: 0.179119 seconds (Total)
## Chain 3:
##
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 4).
## Chain 4:
## Chain 4: Gradient evaluation took 1.2e-05 seconds
## Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.12 seconds.
## Chain 4: Adjust your expectations accordingly!
## Chain 4:
## Chain 4:
## Chain 4: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 4: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 4: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 4: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 4: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 4: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 4: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 4: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 4: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 4: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 4: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 4: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 4:
## Chain 4: Elapsed Time: 0.099922 seconds (Warm-up)
## Chain 4: 0.091463 seconds (Sampling)
## Chain 4: 0.191385 seconds (Total)
## Chain 4:
In the summary table, we want to see Rhat values close to 1.0 and effective sample sizes (n_eff) in the thousands.
# Posterior summary: check Rhat near 1.0 and n_eff for convergence.
summary(models$retrieve_value)
##
## Model Info:
## function: stan_glm
## family: gaussian [identity]
## formula: log_completion_time ~ dataset * oracle * search
## algorithm: sampling
## sample: 4000 (posterior sample size)
## priors: see help('prior_summary')
## observations: 59
## predictors: 8
##
## Estimates:
## mean sd 10% 50% 90%
## (Intercept) 5.3 0.2 5.1 5.3 5.5
## datasetmovies 0.0 0.2 -0.3 0.0 0.2
## oracledziban 0.2 0.2 -0.1 0.2 0.4
## searchdfs 0.0 0.2 -0.2 0.0 0.3
## datasetmovies:oracledziban 0.0 0.3 -0.4 0.0 0.3
## datasetmovies:searchdfs 0.1 0.3 -0.2 0.1 0.4
## oracledziban:searchdfs -0.1 0.3 -0.5 -0.1 0.3
## datasetmovies:oracledziban:searchdfs -0.1 0.4 -0.6 -0.1 0.3
## sigma 0.5 0.0 0.4 0.5 0.5
##
## Fit Diagnostics:
## mean sd 10% 50% 90%
## mean_PPD 5.4 0.1 5.3 5.4 5.5
##
## The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).
##
## MCMC diagnostics
## mcse Rhat n_eff
## (Intercept) 0.0 1.0 2258
## datasetmovies 0.0 1.0 2064
## oracledziban 0.0 1.0 1711
## searchdfs 0.0 1.0 2226
## datasetmovies:oracledziban 0.0 1.0 1668
## datasetmovies:searchdfs 0.0 1.0 2054
## oracledziban:searchdfs 0.0 1.0 1802
## datasetmovies:oracledziban:searchdfs 0.0 1.0 1905
## sigma 0.0 1.0 3660
## mean_PPD 0.0 1.0 3854
## log-posterior 0.1 1.0 1782
##
## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
Trace plots help us check whether there is evidence of non-convergence for the model.
# Default stanreg plot. NOTE(review): the default plotfun is posterior
# intervals, not traces — pass plotfun = "trace" if trace plots are intended.
plot(models$retrieve_value)
In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).
# Pairwise posterior scatterplots to spot strongly correlated parameters.
pairs(models$retrieve_value)
Using draws from the posterior, we can visualize parameter effects and average response. Be sure to apply an exponential transform to our log-transformed times to make them interpretable! The thicker, shorter line represents the 50% credible interval, while the thinner, longer line represents the 95% credible interval.
# Expected (fitted) posterior draws for each observed design cell.
# NOTE(review): add_fitted_draws is deprecated in newer tidybayes in favor of
# add_epred_draws (renames .value to .epred) — confirm version before migrating.
draw_data$retrieve_value <- data_retrieve_value %>%
add_fitted_draws(models$retrieve_value, seed = seed, re_formula = NA)
## Instead of posterior_linpred(..., transform=TRUE) please call posterior_epred(), which provides equivalent functionality.
# Posterior distribution of expected completion time per condition,
# back-transformed from log-seconds. Fix: alpha is a constant layer setting,
# not a data mapping — inside aes() it creates a spurious "0.5" legend entry,
# so set it on the layer instead.
plot_retrieve_value <- ggplot(draw_data$retrieve_value, aes(
x = exp(.value),
y = condition,
fill = dataset
)) + stat_halfeye(.width = c(.95, .5), alpha = 0.5) +
labs(x = "Time (Seconds)", y = "Condition")
plot_retrieve_value
We can get the numeric values of the interval boundaries shown above with mean_qi
# Numeric 95% / 50% interval bounds for the expected times shown above.
fit_info_retrieve_value <- draw_data$retrieve_value %>% mean_qi(exp(.value), .width = c(.95, .5))
fit_info_retrieve_value
## # A tibble: 118 x 16
## # Groups: participant_id, dataset, oracle, search, task, completion_time,
## # condition, task_type, log_completion_time [59]
## participant_id dataset oracle search task completion_time condition
## <int> <fct> <fct> <fct> <chr> <dbl> <chr>
## 1 1 birdst… compa… bfs 2. R… 131. compassq…
## 2 2 birdst… compa… bfs 2. R… 182. compassq…
## 3 3 birdst… compa… bfs 2. R… 158. compassq…
## 4 4 movies compa… dfs 2. R… 274. compassq…
## 5 5 movies dziban bfs 2. R… 242. dziban_b…
## 6 6 birdst… compa… dfs 2. R… 191. compassq…
## 7 7 movies compa… dfs 2. R… 186. compassq…
## 8 8 movies dziban dfs 2. R… 375. dziban_d…
## 9 9 movies dziban bfs 2. R… 263. dziban_b…
## 10 10 birdst… compa… bfs 2. R… 366. compassq…
## # … with 108 more rows, and 9 more variables: task_type <chr>,
## # log_completion_time <dbl>, .row <int>, `exp(.value)` <dbl>, .lower <dbl>,
## # .upper <dbl>, .width <dbl>, .point <chr>, .interval <chr>
## Saving 7 x 5 in image
Next, we want to see if there is any significant difference in completion time between the two search algorithms (bfs and dfs) and the two oracles (dziban and compassql).
# Fitted draws grouped by design cell, reused for both contrasts below.
retrieve_value_predictive_data <- data_retrieve_value %>%
add_fitted_draws(models$retrieve_value, seed = seed, re_formula = NA) %>%
group_by(search, oracle, dataset, .draw)
## Instead of posterior_linpred(..., transform=TRUE) please call posterior_epred(), which provides equivalent functionality.
# Per-draw expected time by search x dataset, then the dfs - bfs contrast.
# Fix: weighted.mean() with no weights is exactly mean().
search_differences$retrieve_value <- retrieve_value_predictive_data %>%
group_by(search, dataset, .draw) %>%
summarize(time = mean(exp(.value))) %>%
compare_levels(time, by = search) %>%
rename(diff_in_time = time)
## `summarise()` regrouping output by 'search', 'dataset' (override with `.groups` argument)
# Task label used as the y axis category when plotting this contrast.
search_differences$retrieve_value$metric = "2. Retrieve Value"
# Posterior difference in expected completion time between search algorithms;
# dashed line at zero marks "no difference". Fix: alpha moved out of aes().
search_differences$retrieve_value %>%
ggplot(aes(x = diff_in_time, y = metric, fill = dataset)) +
xlab(paste0("Expected Difference in Completion Time (",search_differences$retrieve_value[1,'search'],")")) +
ylab("Task")+
stat_halfeye(.width = c(.95, .5), alpha = 0.5) +
geom_vline(xintercept = 0, linetype = "longdash") +
theme_minimal()
We can double-check the boundaries of the credible intervals to be sure whether or not the interval contains zero.
# Interval bounds for the search contrast — does each CI contain zero?
search_differences$retrieve_value %>% mean_qi(diff_in_time, .width = c(.95, .5))
## # A tibble: 4 x 8
## # Groups: search [1]
## search dataset diff_in_time .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 dfs - bfs birdstrikes -2.23 -76.1 73.4 0.95 mean qi
## 2 dfs - bfs movies 2.98 -66.4 70.5 0.95 mean qi
## 3 dfs - bfs birdstrikes -2.23 -28.1 22.3 0.5 mean qi
## 4 dfs - bfs movies 2.98 -18.9 26.1 0.5 mean qi
# Per-draw expected time by oracle x dataset, then the dziban - compassql
# contrast. Fix: weighted.mean() with no weights is exactly mean().
oracle_differences$retrieve_value <- retrieve_value_predictive_data %>%
group_by(oracle, dataset, .draw) %>%
summarize(time = mean(exp(.value))) %>%
compare_levels(time, by = oracle) %>%
rename(diff_in_time = time)
## `summarise()` regrouping output by 'oracle', 'dataset' (override with `.groups` argument)
# Task label used as the y axis category when plotting this contrast.
oracle_differences$retrieve_value$metric = "2. Retrieve Value"
# Posterior difference in expected completion time between oracles; dashed
# line at zero marks "no difference". Fix: alpha moved out of aes().
oracle_differences$retrieve_value %>%
ggplot(aes(x = diff_in_time, y = metric, fill = dataset)) +
xlab(paste0("Expected Difference in Completion Time (",oracle_differences$retrieve_value[1,'oracle'],")")) +
ylab("Task")+
stat_halfeye(.width = c(.95, .5), alpha = 0.5) +
geom_vline(xintercept = 0, linetype = "longdash") +
theme_minimal()
We can double-check the boundaries of the credible intervals to be sure whether or not the interval contains zero.
# Interval bounds for the oracle contrast — does each CI contain zero?
oracle_differences$retrieve_value %>% mean_qi(diff_in_time, .width = c(.95, .5))
## # A tibble: 4 x 8
## # Groups: oracle [1]
## oracle dataset diff_in_time .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 dziban - compa… birdstrik… 26.1 -48.3 102. 0.95 mean qi
## 2 dziban - compa… movies -1.24 -71.5 67.6 0.95 mean qi
## 3 dziban - compa… birdstrik… 26.1 0.774 51.4 0.5 mean qi
## 4 dziban - compa… movies -1.24 -23.8 23.0 0.5 mean qi
# Task 3 (Prediction): same model specification and priors as Task 1.
data_prediction <- subset(time_data, task == "3. Prediction")
models$prediction <- stan_glm(
log_completion_time ~ dataset * oracle * search,
data = data_prediction,
# Log-seconds-scale priors; autoscale = FALSE keeps the stated sd.
prior_intercept = normal(prior_mean, prior_sd, autoscale = FALSE),
prior = normal(0, prior_sd, autoscale = FALSE),
seed = seed
)
##
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 1).
## Chain 1:
## Chain 1: Gradient evaluation took 2e-05 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.2 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1:
## Chain 1:
## Chain 1: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 1: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 1: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 1: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 1: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 1: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 1: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 1: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 1: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 1: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 1: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 1: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 1:
## Chain 1: Elapsed Time: 0.125573 seconds (Warm-up)
## Chain 1: 0.121491 seconds (Sampling)
## Chain 1: 0.247064 seconds (Total)
## Chain 1:
##
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 2).
## Chain 2:
## Chain 2: Gradient evaluation took 3.6e-05 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.36 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2:
## Chain 2:
## Chain 2: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 2: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 2: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 2: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 2: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 2: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 2: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 2: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 2: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 2: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 2: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 2: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 2:
## Chain 2: Elapsed Time: 0.111821 seconds (Warm-up)
## Chain 2: 0.101047 seconds (Sampling)
## Chain 2: 0.212868 seconds (Total)
## Chain 2:
##
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 3).
## Chain 3:
## Chain 3: Gradient evaluation took 1.2e-05 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.12 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3:
## Chain 3:
## Chain 3: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 3: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 3: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 3: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 3: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 3: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 3: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 3: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 3: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 3: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 3: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 3: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 3:
## Chain 3: Elapsed Time: 0.118462 seconds (Warm-up)
## Chain 3: 0.132986 seconds (Sampling)
## Chain 3: 0.251448 seconds (Total)
## Chain 3:
##
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 4).
## Chain 4:
## Chain 4: Gradient evaluation took 2e-05 seconds
## Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.2 seconds.
## Chain 4: Adjust your expectations accordingly!
## Chain 4:
## Chain 4:
## Chain 4: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 4: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 4: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 4: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 4: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 4: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 4: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 4: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 4: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 4: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 4: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 4: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 4:
## Chain 4: Elapsed Time: 0.098254 seconds (Warm-up)
## Chain 4: 0.08518 seconds (Sampling)
## Chain 4: 0.183434 seconds (Total)
## Chain 4:
In the summary table, we want to see Rhat values close to 1.0 and effective sample sizes (n_eff) in the thousands.
# Posterior summary: check Rhat near 1.0 and n_eff for convergence.
summary(models$prediction)
##
## Model Info:
## function: stan_glm
## family: gaussian [identity]
## formula: log_completion_time ~ dataset * oracle * search
## algorithm: sampling
## sample: 4000 (posterior sample size)
## priors: see help('prior_summary')
## observations: 59
## predictors: 8
##
## Estimates:
## mean sd 10% 50% 90%
## (Intercept) 6.3 0.1 6.1 6.3 6.5
## datasetmovies 0.0 0.2 -0.2 0.0 0.3
## oracledziban 0.0 0.2 -0.2 0.0 0.2
## searchdfs 0.3 0.2 0.1 0.3 0.6
## datasetmovies:oracledziban 0.2 0.2 -0.1 0.2 0.5
## datasetmovies:searchdfs 0.1 0.2 -0.2 0.1 0.4
## oracledziban:searchdfs -0.3 0.3 -0.6 -0.3 0.1
## datasetmovies:oracledziban:searchdfs -0.2 0.3 -0.6 -0.2 0.2
## sigma 0.4 0.0 0.4 0.4 0.5
##
## Fit Diagnostics:
## mean sd 10% 50% 90%
## mean_PPD 6.4 0.1 6.4 6.5 6.5
##
## The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).
##
## MCMC diagnostics
## mcse Rhat n_eff
## (Intercept) 0.0 1.0 2330
## datasetmovies 0.0 1.0 2082
## oracledziban 0.0 1.0 2138
## searchdfs 0.0 1.0 1919
## datasetmovies:oracledziban 0.0 1.0 2045
## datasetmovies:searchdfs 0.0 1.0 1680
## oracledziban:searchdfs 0.0 1.0 2100
## datasetmovies:oracledziban:searchdfs 0.0 1.0 1834
## sigma 0.0 1.0 2899
## mean_PPD 0.0 1.0 4243
## log-posterior 0.1 1.0 1565
##
## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
Trace plots help us check whether there is evidence of non-convergence for the model.
# Default stanreg plot. NOTE(review): the default plotfun is posterior
# intervals, not traces — pass plotfun = "trace" if trace plots are intended.
plot(models$prediction)
In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).
# Pairwise posterior scatterplots to spot strongly correlated parameters.
pairs(models$prediction)
Using draws from the posterior, we can visualize parameter effects and average response. Be sure to apply an exponential transform to our log-transformed times to make them interpretable! The thicker, shorter line represents the 50% credible interval, while the thinner, longer line represents the 95% credible interval.
# Expected (fitted) posterior draws for each observed design cell.
# NOTE(review): add_fitted_draws is deprecated in newer tidybayes in favor of
# add_epred_draws (renames .value to .epred) — confirm version before migrating.
draw_data$prediction <- data_prediction %>%
add_fitted_draws(models$prediction, seed = seed, re_formula = NA)
## Instead of posterior_linpred(..., transform=TRUE) please call posterior_epred(), which provides equivalent functionality.
# Posterior distribution of expected completion time per condition,
# back-transformed from log-seconds. Fix: alpha is a constant layer setting,
# not a data mapping — inside aes() it creates a spurious "0.5" legend entry,
# so set it on the layer instead.
plot_prediction <- ggplot(draw_data$prediction, aes(
x = exp(.value),
y = condition,
fill = dataset
)) + stat_halfeye(.width = c(.95, .5), alpha = 0.5) +
labs(x = "Time (Seconds)", y = "Condition")
plot_prediction
We can get the numeric values of the interval boundaries shown above with mean_qi
# Numeric 95% / 50% interval bounds for the expected times shown above.
fit_info_prediction <- draw_data$prediction %>% mean_qi(exp(.value), .width = c(.95, .5))
fit_info_prediction
## # A tibble: 118 x 16
## # Groups: participant_id, dataset, oracle, search, task, completion_time,
## # condition, task_type, log_completion_time [59]
## participant_id dataset oracle search task completion_time condition
## <int> <fct> <fct> <fct> <chr> <dbl> <chr>
## 1 1 birdst… compa… bfs 3. P… 529. compassq…
## 2 2 birdst… compa… bfs 3. P… 558. compassq…
## 3 3 birdst… compa… bfs 3. P… 555. compassq…
## 4 4 movies compa… dfs 3. P… 1036. compassq…
## 5 5 movies dziban bfs 3. P… 659. dziban_b…
## 6 6 birdst… compa… dfs 3. P… 592. compassq…
## 7 7 movies compa… dfs 3. P… 537. compassq…
## 8 8 movies dziban dfs 3. P… 498. dziban_d…
## 9 9 movies dziban bfs 3. P… 1140. dziban_b…
## 10 10 birdst… compa… bfs 3. P… 712. compassq…
## # … with 108 more rows, and 9 more variables: task_type <chr>,
## # log_completion_time <dbl>, .row <int>, `exp(.value)` <dbl>, .lower <dbl>,
## # .upper <dbl>, .width <dbl>, .point <chr>, .interval <chr>
## Saving 7 x 5 in image
Next, we want to see if there is any significant difference in completion time between the two search algorithms (bfs and dfs) and the two oracles (dziban and compassql).
prediction_predictive_data <- data_prediction %>%
add_fitted_draws(models$prediction, seed = seed, re_formula = NA) %>%
group_by(search, oracle, dataset, .draw)
## Instead of posterior_linpred(..., transform=TRUE) please call posterior_epred(), which provides equivalent functionality.
search_differences$prediction <- prediction_predictive_data %>%
group_by(search, dataset, .draw) %>%
summarize(time = weighted.mean(exp(.value))) %>%
compare_levels(time, by = search) %>%
rename(diff_in_time = time)
## `summarise()` regrouping output by 'search', 'dataset' (override with `.groups` argument)
search_differences$prediction$metric = "3. Prediction"
search_differences$prediction %>%
ggplot(aes(x = diff_in_time, y = metric, fill = dataset, alpha = 0.5)) +
xlab(paste0("Expected Difference in Completion Time (",search_differences$prediction[1,'search'],")")) +
ylab("Task")+
stat_halfeye(.width = c(.95, .5)) +
geom_vline(xintercept = 0, linetype = "longdash") +
theme_minimal()
We can double-check the boundaries of the credible intervals to confirm whether or not each interval contains zero.
search_differences$prediction %>% mean_qi(diff_in_time, .width = c(.95, .5))
## # A tibble: 4 x 8
## # Groups: search [1]
## search dataset diff_in_time .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 dfs - bfs birdstrikes 135. -41.2 314. 0.95 mean qi
## 2 dfs - bfs movies 107. -91.0 305. 0.95 mean qi
## 3 dfs - bfs birdstrikes 135. 74.5 195. 0.5 mean qi
## 4 dfs - bfs movies 107. 43.8 175. 0.5 mean qi
oracle_differences$prediction <- prediction_predictive_data %>%
group_by(oracle, dataset, .draw) %>%
summarize(time = weighted.mean(exp(.value))) %>%
compare_levels(time, by = oracle) %>%
rename(diff_in_time = time)
## `summarise()` regrouping output by 'oracle', 'dataset' (override with `.groups` argument)
oracle_differences$prediction$metric = "3. Prediction"
oracle_differences$prediction %>%
ggplot(aes(x = diff_in_time, y = metric, fill = dataset, alpha = 0.5)) +
xlab(paste0("Expected Difference in Completion Time (",oracle_differences$prediction[1,'oracle'],")")) +
ylab("Task")+
stat_halfeye(.width = c(.95, .5)) +
geom_vline(xintercept = 0, linetype = "longdash") +
theme_minimal()
We can double-check the boundaries of the credible intervals to confirm whether or not each interval contains zero.
oracle_differences$prediction %>% mean_qi(diff_in_time, .width = c(.95, .5))
## # A tibble: 4 x 8
## # Groups: oracle [1]
## oracle dataset diff_in_time .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 dziban - compas… birdstrik… -84.9 -273. 93.1 0.95 mean qi
## 2 dziban - compas… movies -52.7 -250. 143. 0.95 mean qi
## 3 dziban - compas… birdstrik… -84.9 -147. -23.2 0.5 mean qi
## 4 dziban - compas… movies -52.7 -118. 12.1 0.5 mean qi
data_exploration <- subset(time_data, task == "4. Exploration")
models$exploration <- stan_glm(
log_completion_time ~ dataset * oracle * search,
data = data_exploration,
prior_intercept = normal(prior_mean, prior_sd, autoscale = FALSE),
prior = normal(0, prior_sd, autoscale = FALSE),
seed = seed
)
##
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 1).
## Chain 1:
## Chain 1: Gradient evaluation took 2.2e-05 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.22 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1:
## Chain 1:
## Chain 1: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 1: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 1: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 1: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 1: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 1: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 1: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 1: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 1: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 1: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 1: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 1: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 1:
## Chain 1: Elapsed Time: 0.141074 seconds (Warm-up)
## Chain 1: 0.112687 seconds (Sampling)
## Chain 1: 0.253761 seconds (Total)
## Chain 1:
##
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 2).
## Chain 2:
## Chain 2: Gradient evaluation took 1.6e-05 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.16 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2:
## Chain 2:
## Chain 2: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 2: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 2: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 2: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 2: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 2: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 2: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 2: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 2: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 2: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 2: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 2: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 2:
## Chain 2: Elapsed Time: 0.113867 seconds (Warm-up)
## Chain 2: 0.110881 seconds (Sampling)
## Chain 2: 0.224748 seconds (Total)
## Chain 2:
##
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 3).
## Chain 3:
## Chain 3: Gradient evaluation took 1.9e-05 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.19 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3:
## Chain 3:
## Chain 3: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 3: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 3: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 3: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 3: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 3: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 3: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 3: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 3: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 3: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 3: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 3: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 3:
## Chain 3: Elapsed Time: 0.118154 seconds (Warm-up)
## Chain 3: 0.133708 seconds (Sampling)
## Chain 3: 0.251862 seconds (Total)
## Chain 3:
##
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 4).
## Chain 4:
## Chain 4: Gradient evaluation took 1.3e-05 seconds
## Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.13 seconds.
## Chain 4: Adjust your expectations accordingly!
## Chain 4:
## Chain 4:
## Chain 4: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 4: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 4: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 4: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 4: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 4: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 4: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 4: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 4: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 4: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 4: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 4: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 4:
## Chain 4: Elapsed Time: 0.110991 seconds (Warm-up)
## Chain 4: 0.113861 seconds (Sampling)
## Chain 4: 0.224852 seconds (Total)
## Chain 4:
In the summary table, we want to see Rhat values close to 1.0 and Bulk_ESS in the thousands.
summary(models$exploration)
##
## Model Info:
## function: stan_glm
## family: gaussian [identity]
## formula: log_completion_time ~ dataset * oracle * search
## algorithm: sampling
## sample: 4000 (posterior sample size)
## priors: see help('prior_summary')
## observations: 59
## predictors: 8
##
## Estimates:
## mean sd 10% 50% 90%
## (Intercept) 6.6 0.1 6.4 6.6 6.7
## datasetmovies 0.0 0.1 -0.1 0.0 0.2
## oracledziban -0.1 0.1 -0.3 -0.1 0.1
## searchdfs 0.0 0.1 -0.1 0.0 0.2
## datasetmovies:oracledziban 0.1 0.2 -0.2 0.1 0.3
## datasetmovies:searchdfs -0.2 0.2 -0.4 -0.2 0.0
## oracledziban:searchdfs 0.1 0.2 -0.1 0.1 0.3
## datasetmovies:oracledziban:searchdfs 0.0 0.3 -0.3 0.0 0.4
## sigma 0.3 0.0 0.2 0.3 0.3
##
## Fit Diagnostics:
## mean sd 10% 50% 90%
## mean_PPD 6.6 0.1 6.5 6.6 6.6
##
## The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).
##
## MCMC diagnostics
## mcse Rhat n_eff
## (Intercept) 0.0 1.0 2294
## datasetmovies 0.0 1.0 1991
## oracledziban 0.0 1.0 1625
## searchdfs 0.0 1.0 1990
## datasetmovies:oracledziban 0.0 1.0 1496
## datasetmovies:searchdfs 0.0 1.0 1784
## oracledziban:searchdfs 0.0 1.0 1572
## datasetmovies:oracledziban:searchdfs 0.0 1.0 1602
## sigma 0.0 1.0 3187
## mean_PPD 0.0 1.0 3676
## log-posterior 0.1 1.0 1741
##
## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
Trace plots help us check whether there is evidence of non-convergence for the model.
plot(models$exploration)
In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).
pairs(models$exploration)
Using draws from the posterior, we can visualize parameter effects and average response. Be sure to apply an exponential transform to our log-transformed times to make them interpretable! The thicker, shorter line represents the 50% credible interval, while the thinner, longer line represents the 95% credible interval.
draw_data$exploration <- data_exploration %>%
add_fitted_draws(models$exploration, seed = seed, re_formula = NA)
## Instead of posterior_linpred(..., transform=TRUE) please call posterior_epred(), which provides equivalent functionality.
plot_exploration = ggplot(draw_data$exploration, aes(
x = exp(.value),
y = condition,
fill = dataset,
alpha = 0.5
)) + stat_halfeye(.width = c(.95, .5)) +
labs(x = "Time (Seconds)", y = "Condition")
plot_exploration
We can get the numeric values of the interval boundaries shown above with `mean_qi()`.
fit_info_exploration <- draw_data$exploration %>% mean_qi(exp(.value), .width = c(.95, .5))
fit_info_exploration
## # A tibble: 118 x 16
## # Groups: participant_id, dataset, oracle, search, task, completion_time,
## # condition, task_type, log_completion_time [59]
## participant_id dataset oracle search task completion_time condition
## <int> <fct> <fct> <fct> <chr> <dbl> <chr>
## 1 1 birdst… compa… bfs 4. E… 765. compassq…
## 2 2 birdst… compa… bfs 4. E… 682. compassq…
## 3 3 birdst… compa… bfs 4. E… 955. compassq…
## 4 4 movies compa… dfs 4. E… 950. compassq…
## 5 5 movies dziban bfs 4. E… 596. dziban_b…
## 6 6 birdst… compa… dfs 4. E… 597. compassq…
## 7 7 movies compa… dfs 4. E… 670. compassq…
## 8 8 movies dziban dfs 4. E… 880. dziban_d…
## 9 9 movies dziban bfs 4. E… 830. dziban_b…
## 10 10 birdst… compa… bfs 4. E… 914. compassq…
## # … with 108 more rows, and 9 more variables: task_type <chr>,
## # log_completion_time <dbl>, .row <int>, `exp(.value)` <dbl>, .lower <dbl>,
## # .upper <dbl>, .width <dbl>, .point <chr>, .interval <chr>
## Saving 7 x 5 in image
Next, we want to see if there is any significant difference in completion time between the two search algorithms (bfs and dfs) and the two oracles (dziban and compassql).
exploration_predictive_data <- data_exploration %>%
add_fitted_draws(models$exploration, seed = seed, re_formula = NA) %>%
group_by(search, oracle, dataset, .draw)
## Instead of posterior_linpred(..., transform=TRUE) please call posterior_epred(), which provides equivalent functionality.
search_differences$exploration <- exploration_predictive_data %>%
group_by(search, dataset, .draw) %>%
summarize(time = weighted.mean(exp(.value))) %>%
compare_levels(time, by = search) %>%
rename(diff_in_time = time)
## `summarise()` regrouping output by 'search', 'dataset' (override with `.groups` argument)
search_differences$exploration$metric = "4. Exploration"
search_differences$exploration %>%
ggplot(aes(x = diff_in_time, y = metric, fill = dataset, alpha = 0.5)) +
xlab(paste0("Expected Difference in Completion Time (",search_differences$exploration[1,'search'],")")) +
ylab("Task")+
stat_halfeye(.width = c(.95, .5)) +
geom_vline(xintercept = 0, linetype = "longdash") +
theme_minimal()
We can double-check the boundaries of the credible intervals to confirm whether or not each interval contains zero.
search_differences$exploration %>% mean_qi(diff_in_time, .width = c(.95, .5))
## # A tibble: 4 x 8
## # Groups: search [1]
## search dataset diff_in_time .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 dfs - bfs birdstrikes 61.9 -87.5 214. 0.95 mean qi
## 2 dfs - bfs movies -70.1 -210. 76.9 0.95 mean qi
## 3 dfs - bfs birdstrikes 61.9 9.89 112. 0.5 mean qi
## 4 dfs - bfs movies -70.1 -117. -23.9 0.5 mean qi
oracle_differences$exploration <- exploration_predictive_data %>%
group_by(oracle, dataset, .draw) %>%
summarize(time = weighted.mean(exp(.value))) %>%
compare_levels(time, by = oracle) %>%
rename(diff_in_time = time)
## `summarise()` regrouping output by 'oracle', 'dataset' (override with `.groups` argument)
oracle_differences$exploration$metric = "4. Exploration"
oracle_differences$exploration %>%
ggplot(aes(x = diff_in_time, y = metric, fill = dataset, alpha = 0.5)) +
xlab(paste0("Expected Difference in Completion Time (",oracle_differences$exploration[1,'oracle'],")")) +
ylab("Task")+
stat_halfeye(.width = c(.95, .5)) +
geom_vline(xintercept = 0, linetype = "longdash") +
theme_minimal()
We can double-check the boundaries of the credible intervals to confirm whether or not each interval contains zero.
oracle_differences$exploration %>% mean_qi(diff_in_time, .width = c(.95, .5))
## # A tibble: 4 x 8
## # Groups: oracle [1]
## oracle dataset diff_in_time .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 dziban - compa… birdstrik… -31.0 -186. 118. 0.95 mean qi
## 2 dziban - compa… movies 38.7 -100. 177. 0.95 mean qi
## 3 dziban - compa… birdstrik… -31.0 -79.7 20.0 0.5 mean qi
## 4 dziban - compa… movies 38.7 -8.00 86.5 0.5 mean qi
Plot all of the posterior draws on one plot
all_draws <- rbind(draw_data$find_extremum, draw_data$retrieve_value, draw_data$prediction, draw_data$exploration)
plot = ggplot(all_draws, aes(
x = exp(.value),
y = task,
fill = search,
alpha = 0.5
)) + stat_halfeye(.width = c(.95, .5)) +
labs(x = "Time (Seconds)", y = "Task") + facet_grid(. ~ dataset)
## Saving 7 x 5 in image
Putting all of the plots for search algorithm and oracle differences on the same plot:
combined_search_differences <- rbind(search_differences$find_extremum, search_differences$retrieve_value, search_differences$prediction, search_differences$exploration)
combined_search_differences$metric <- factor(combined_search_differences$metric, levels=rev(task_list))
search_differences_plot <- combined_search_differences %>%
ggplot(aes(x = diff_in_time, y = metric)) +
xlab(paste0("Time Difference (Seconds): ", combined_search_differences[1,'search'])) +
ylab("Task") +
stat_halfeye(.width = c(.95, .5)) +
geom_vline(xintercept = 0, linetype = "longdash") +
theme_minimal()
search_differences_plot
search_differences_plot_split_by_dataset <- combined_search_differences %>%
ggplot(aes(x = diff_in_time, y = metric, fill= dataset)) +
xlab(paste0("Time Difference (Seconds): ", combined_search_differences[1,'search'])) +
ylab("Task") +
facet_grid(. ~ dataset) +
stat_halfeye(.width = c(.95, .5)) +
geom_vline(xintercept = 0, linetype = "longdash") +
theme_minimal()
search_differences_plot_split_by_dataset
Now for oracle differences
combined_oracle_differences <- rbind(oracle_differences$find_extremum, oracle_differences$retrieve_value, oracle_differences$prediction, oracle_differences$exploration)
combined_oracle_differences$metric <- factor(combined_oracle_differences$metric, levels=rev(task_list))
oracle_differences_plot <- combined_oracle_differences %>%
ggplot(aes(x = diff_in_time, y = metric)) +
xlab(paste0("Time Difference (Seconds): ", combined_oracle_differences[1,'oracle'])) +
ylab("Task") +
stat_halfeye(.width = c(.95, .5)) +
geom_vline(xintercept = 0, linetype = "longdash") +
theme_minimal()
oracle_differences_plot
oracle_differences_plot_split_by_dataset <- combined_oracle_differences %>%
ggplot(aes(x = diff_in_time, y = metric, fill= dataset)) +
xlab(paste0("Time Difference (Seconds): ", combined_oracle_differences[1,'oracle'])) +
ylab("Task") +
facet_grid(. ~ dataset) +
stat_halfeye(.width = c(.95, .5)) +
geom_vline(xintercept = 0, linetype = "longdash") +
theme_minimal()
oracle_differences_plot_split_by_dataset